import os
print (os.getcwd())
/home/data/vip8t02/Project/PanMyeloid-assignment/report
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
%matplotlib inline
sc.settings.verbosity = 3 # errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80)
sc.logging.print_versions()
scanpy==1.4.3 anndata==0.7.6 umap==0.3.9 numpy==1.21.2 scipy==1.7.1 pandas==1.3.3 scikit-learn==0.24.2 statsmodels==0.13.0 python-igraph==0.9.7
Finished QC in R:
For other collected scRNA-seq datasets, we applied the same filtering steps to 10X Genomics datasets (CRC, KIDNEY, STAD, CRC, HCC, NPC and PAAD).
CRC_ann = sc.read_h5ad("../processedData/CRC.processsed/CRC-myeloid-filtered.h5ad")
CRC_ann
AnnData object with n_obs × n_vars = 13795 × 15820
obs: 'n_counts', 'n_genes', 'cancer', 'patient', 'tissue', 'tech', 'tech_10X', 'percent_mito', 'percent_hsp', 'MajorCluster', 'Sub_Cluster'
var: 'features'
After quality control, we applied the library-size correction method to normalize the raw count by using
normalize_totalfunction in Scanpy. Then the logarithmized normalized count matrix was used for the downstream analysis.
Actually, there are two main approaches to normalize single cell data:
One is a simple linear scaling to adjust counts such that each cell has about the same total library size. Examples include converting to counts per million (CPM) which does a reasonable job of correcting for differences in library size.
Another methods are more complex, and generally involve parametric modeling of count data to perform nonlinear normalization. These methods are useful when there are more complex sources of unwanted variation (e.g., for highly heterogeneous populations of cells with different sizes).
We usually just stick to the simple, but still need extract attention for some spectial situations. Here is a review Cole,Michael B.et al., Cell Systems, 201930080-8?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS2405471219300808%3Fshowall%3Dtrue#secsectitle0020) for the comparison of normalization methods.
# normalize with counts per million, excluding highly expressed genes from the size factor calculation.
sc.pp.normalize_per_cell(CRC_ann, counts_per_cell_after=1e4)
# logaritmize
sc.pp.log1p(CRC_ann)
PAAD = pd.read_csv("../rawData/PAAD/GSE154763_PAAD_normalized_expression.csv.gz", index_col=0)
cellinfo = pd.read_csv("../rawData/PAAD/GSE154763_PAAD_metadata.csv.gz", index_col=0)
geneinfo = pd.DataFrame(PAAD.columns,index=PAAD.columns,columns=['genes_index'])
PAAD_ann = sc.AnnData(PAAD, obs=cellinfo, var = geneinfo)
PAAD_ann
AnnData object with n_obs × n_vars = 2853 × 14140
obs: 'percent_mito', 'n_counts', 'percent_hsp', 'barcode', 'batch', 'library_id', 'cancer', 'patient', 'tissue', 'n_genes', 'MajorCluster', 'source', 'tech', 'UMAP1', 'UMAP2'
var: 'genes_index'
MYE = pd.read_csv("../rawData/MYE/GSE154763_MYE_normalized_expression.csv.gz", index_col=0)
cellinfo = pd.read_csv("../rawData/MYE/GSE154763_MYE_metadata.csv.gz", index_col=0)
geneinfo = pd.DataFrame(MYE.columns,index=MYE.columns,columns=['genes_index'])
MYE_ann = sc.AnnData(MYE, obs=cellinfo, var = geneinfo)
MYE_ann
AnnData object with n_obs × n_vars = 7619 × 15634
obs: 'batch', 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'MajorCluster', 'source', 'tech', 'cancer', 'UMAP1', 'UMAP2'
var: 'genes_index'
ESCA = pd.read_csv("../rawData/ESCA/GSE154763_ESCA_normalized_expression.csv.gz", index_col=0)
cellinfo = pd.read_csv("../rawData/ESCA/GSE154763_ESCA_metadata.csv.gz", index_col=0)
geneinfo = pd.DataFrame(ESCA.columns,index=ESCA.columns,columns=['genes_index'])
ESCA_ann = sc.AnnData(ESCA, obs=cellinfo, var = geneinfo)
ESCA_ann
AnnData object with n_obs × n_vars = 7673 × 15550
obs: 'percent_mito', 'n_counts', 'percent_hsp', 'barcode', 'batch', 'library_id', 'cancer', 'patient', 'tissue', 'n_genes', 'MajorCluster', 'source', 'tech', 'UMAP1', 'UMAP2'
var: 'genes_index'
UCEC = pd.read_csv("../rawData/UCEC/GSE154763_UCEC_normalized_expression.csv.gz", index_col=0)
cellinfo = pd.read_csv("../rawData/UCEC/GSE154763_UCEC_metadata.csv.gz", index_col=0)
geneinfo = pd.DataFrame(UCEC.columns,index=UCEC.columns,columns=['genes_index'])
UCEC_ann = sc.AnnData(UCEC, obs=cellinfo, var = geneinfo)
UCEC_ann
AnnData object with n_obs × n_vars = 8808 × 15849
obs: 'percent_mito', 'n_counts', 'percent_hsp', 'barcode', 'batch', 'library_id', 'cancer', 'patient', 'tissue', 'n_genes', 'MajorCluster', 'source', 'tech', 'UMAP1', 'UMAP2'
var: 'genes_index'
OVFTC = pd.read_csv("../rawData/OV-FTC/GSE154763_OV-FTC_normalized_expression.csv.gz", index_col=0)
cellinfo = pd.read_csv("../rawData/OV-FTC/GSE154763_OV-FTC_metadata.csv.gz", index_col=0)
geneinfo = pd.DataFrame(OVFTC.columns,index=OVFTC.columns,columns=['genes_index'])
OVFTC_ann = sc.AnnData(OVFTC, obs=cellinfo, var = geneinfo)
OVFTC_ann
AnnData object with n_obs × n_vars = 3888 × 14008
obs: 'percent_mito', 'n_counts', 'percent_hsp', 'barcode', 'batch', 'library_id', 'cancer', 'patient', 'tissue', 'n_genes', 'MajorCluster', 'source', 'tech', 'UMAP1', 'UMAP2'
var: 'genes_index'
LYM = pd.read_csv("../rawData/LYM/GSE154763_LYM_normalized_expression.csv.gz", index_col=0)
cellinfo = pd.read_csv("../rawData/LYM/GSE154763_LYM_metadata.csv.gz", index_col=0)
geneinfo = pd.DataFrame(LYM.columns,index=LYM.columns,columns=['genes_index'])
LYM_ann = sc.AnnData(LYM, obs=cellinfo, var = geneinfo)
LYM_ann
AnnData object with n_obs × n_vars = 615 × 11283
obs: 'batch', 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'MajorCluster', 'source', 'tech', 'cancer', 'UMAP1', 'UMAP2'
var: 'genes_index'
newdata = LYM_ann.concatenate(PAAD_ann,OVFTC_ann,UCEC_ann,ESCA_ann,MYE_ann)
print(newdata.obs['cancer'].value_counts())
UCEC 8808 ESCA 7673 MYE 7619 OV-FTC 3888 PAAD 2853 LYM 615 Name: cancer, dtype: int64
newdata.obs.drop(columns=['source','UMAP1','UMAP2','batch','library_id','barcode'],inplace = True)
newdata.obs['tech_10X']="10X5"
newdata.obs['MajorCluster2']="tmp"
newdata.obs['tech']="10X"
import re
def defineMajor(x):
if bool(re.search("pDC",x)):
return "pDC"
elif bool(re.search("cDC",x)):
return "cDC"
elif bool(re.search("Macro|Mono|TAM",x)):
return "Mono/Macro"
elif bool(re.search("Mast",x)):
return "Mast"
for index in range(newdata.obs.shape[0]):
newdata.obs.iloc[index,10] = defineMajor(newdata.obs.iloc[index,6])
newdata.obs.rename(columns={'MajorCluster':'Sub_Cluster','MajorCluster2':'MajorCluster'},inplace=True)
newdata.obs
| patient | tissue | percent_hsp | percent_mito | n_genes | n_counts | Sub_Cluster | tech | cancer | tech_10X | MajorCluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| index | |||||||||||
| AACACGTGTTTGGGCC-15-0 | P20181123 | T | 0.006040 | 0.000000 | 4789 | 29802.0 | M02_cDC1_CLEC9A | 10X | LYM | 10X5 | cDC |
| AACCGCGAGTTCGCAT-15-0 | P20181123 | T | 0.003356 | 0.000000 | 1641 | 5065.0 | M06_Macro_ISG15 | 10X | LYM | 10X5 | Mono/Macro |
| AACTGGTAGCCACTAT-15-0 | P20181123 | T | 0.000368 | 0.000000 | 922 | 2716.0 | M06_Macro_ISG15 | 10X | LYM | 10X5 | Mono/Macro |
| AAGGCAGTCGTCGTTC-15-0 | P20181123 | T | 0.004203 | 0.000000 | 1511 | 4759.0 | M06_Macro_ISG15 | 10X | LYM | 10X5 | Mono/Macro |
| AAGGTTCAGATAGGAG-15-0 | P20181123 | T | 0.016225 | 0.000000 | 1350 | 3698.0 | M03_cDC2_CD1C | 10X | LYM | 10X5 | cDC |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTATGCGTCGAGATG-25-5 | P20190322 | T | 0.003601 | 0.031445 | 1316 | 4166.0 | M09_Macro_C1QC | 10X | MYE | 10X5 | Mono/Macro |
| TTTATGCGTGAGGGTT-25-5 | P20190322 | T | 0.004241 | 0.032207 | 1930 | 7545.0 | M09_Macro_C1QC | 10X | MYE | 10X5 | Mono/Macro |
| TTTCCTCCAAGCCTAT-25-5 | P20190322 | T | 0.009058 | 0.042044 | 1821 | 5851.0 | M09_Macro_C1QC | 10X | MYE | 10X5 | Mono/Macro |
| TTTCCTCGTGGACGAT-25-5 | P20190322 | T | 0.003082 | 0.053721 | 1056 | 2271.0 | M09_Macro_C1QC | 10X | MYE | 10X5 | Mono/Macro |
| TTTGTCAAGTCACGCC-25-5 | P20190322 | T | 0.007208 | 0.026855 | 2857 | 14150.0 | M09_Macro_C1QC | 10X | MYE | 10X5 | Mono/Macro |
31456 rows × 11 columns
print(newdata.obs['MajorCluster'].value_counts())
Mono/Macro 25833 cDC 3344 Mast 1829 pDC 450 Name: MajorCluster, dtype: int64
newdata.write("../processedData/alldata.obj/newly-generated-data-fixed-obs.h5ad")
... storing 'patient' as categorical ... storing 'tissue' as categorical ... storing 'Sub_Cluster' as categorical ... storing 'tech' as categorical ... storing 'cancer' as categorical ... storing 'tech_10X' as categorical ... storing 'MajorCluster' as categorical
# merge into one object.
alldata = newdata.concatenate(CRC_ann)
print(alldata.obs['cancer'].value_counts())
CRC 13795 UCEC 8808 ESCA 7673 MYE 7619 OV-FTC 3888 PAAD 2853 LYM 615 Name: cancer, dtype: int64
alldata.obs
| patient | tissue | percent_hsp | percent_mito | n_genes | n_counts | Sub_Cluster | tech | cancer | tech_10X | MajorCluster | batch | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AACACGTGTTTGGGCC-15-0-0 | P20181123 | T | 0.006040 | 0.000000 | 4789 | 29802.000000 | M02_cDC1_CLEC9A | 10X | LYM | 10X5 | cDC | 0 |
| AACCGCGAGTTCGCAT-15-0-0 | P20181123 | T | 0.003356 | 0.000000 | 1641 | 5065.000000 | M06_Macro_ISG15 | 10X | LYM | 10X5 | Mono/Macro | 0 |
| AACTGGTAGCCACTAT-15-0-0 | P20181123 | T | 0.000368 | 0.000000 | 922 | 2716.000000 | M06_Macro_ISG15 | 10X | LYM | 10X5 | Mono/Macro | 0 |
| AAGGCAGTCGTCGTTC-15-0-0 | P20181123 | T | 0.004203 | 0.000000 | 1511 | 4759.000000 | M06_Macro_ISG15 | 10X | LYM | 10X5 | Mono/Macro | 0 |
| AAGGTTCAGATAGGAG-15-0-0 | P20181123 | T | 0.016225 | 0.000000 | 1350 | 3698.000000 | M03_cDC2_CD1C | 10X | LYM | 10X5 | cDC | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| P_T_P1228_10676-1 | P1228 | T | 0.335713 | 0.591267 | 4506 | 16373.216968 | Mono-CD14 | Smart-seq2 | CRC | SmartSeq2 | Mono/Macro | 1 |
| P_T_P1228_10778-1 | P1228 | T | 0.544217 | 0.959686 | 4321 | 14662.114240 | Mast-TPSAB1 | Smart-seq2 | CRC | SmartSeq2 | Mast | 1 |
| P_T_P1228_10784-1 | P1228 | T | 0.451440 | 0.732953 | 4577 | 17858.945581 | TAM-C1QC | Smart-seq2 | CRC | SmartSeq2 | Mono/Macro | 1 |
| P_T_P1228_10799-1 | P1228 | T | 0.482179 | 0.742669 | 4065 | 15981.777171 | TAM-C1QC | Smart-seq2 | CRC | SmartSeq2 | Mono/Macro | 1 |
| P_T_P1228_10800-1 | P1228 | T | 0.816831 | 0.950539 | 4574 | 14047.981586 | TAM-C1QC | Smart-seq2 | CRC | SmartSeq2 | Mono/Macro | 1 |
45251 rows × 12 columns
Then, effects of the total count per cell, the percentage of mitochondrial gene count and the percentage of count for heat shock protein associated genes (HSP) were regressed out by using scanpy.pp.regress_out function.
This is achieved by doing a generalized linear regression using these parameters as covariates in the model. Then the residuals of the model are taken as the "regressed data".
# regress out unwanted variables
sc.pp.regress_out(alldata, ['n_counts', 'percent_mito','percent_hsp'])
regressing out ['n_counts', 'percent_mito', 'percent_hsp']
finished (0:35:27.53)
alldata.write("../processedData/alldata.obj/regressed-alldata.h5ad")
alldata.obs.to_csv("../processedData/alldata.obj/alldata-metadata.csv",sep=',',index=True,header=True)
# Store the full matrix in the raw slot before doing variable gene selection
alldata.raw = alldata
We need to find genes that are highly variable across cells, which in turn will also provide a good separation of the cell clusters.
In brief, 2,000 highly-variable genes were selected for downstream analysis by using scanpy.pp.highly_variable_genes function with parameter “n_top_genes=2000.”
# compute variable genes
sc.pp.highly_variable_genes(alldata, min_mean=0.0125, max_mean=3, min_disp=0.5, n_top_genes=2000,flavor='seurat')
print("Highly variable genes: %d"%sum(alldata.var.highly_variable))
#plot variable genes
sc.pl.highly_variable_genes(alldata)
# subset for variable genes in the dataset
alldata = alldata[:, alldata.var['highly_variable']]
If you pass `n_top_genes`, all cutoffs are ignored.
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
Highly variable genes: 1999
alldata.write("../processedData/alldata.obj/compute-HVG-alldata.h5ad")
alldata = sc.read_h5ad("../processedData/alldata.obj/compute-HVG-alldata.h5ad")
A principal component analysis (PCA) matrix with 100 components were calculated to reveal the main axes of variation and denoise the data by using scanpy.tl.pca function with parameter “svd_solver='arpack', n_comps=100. For visualization, the dimensionality of each dataset was further reduced using Uniform Manifold Approximation and Projection (UMAP) implemented in scanpy.tl.umap function with default parameters.
sc.tl.pca(alldata, svd_solver='arpack', n_comps=100)
computing PCA with n_comps = 100
computing PCA on highly variable genes
finished (0:00:20.51)
sc.pp.neighbors(alldata, n_pcs=100)
sc.tl.umap(alldata)
computing neighbors
using 'X_pca' with n_pcs = 100
/home/data/vip8t02/anaconda3/envs/scanpy-1.4.3/lib/python3.9/site-packages/scanpy/neighbors/__init__.py:88: DeprecationWarning: Use is_view instead of isview, isview will be removed in the future. if adata.isview: # we shouldn't need this here...
finished (0:00:25.29) --> added to `.uns['neighbors']`
'distances', distances for each pair of neighbors
'connectivities', weighted adjacency matrix
computing UMAP
using 'X_pca' with n_pcs = 100
finished (0:00:32.08) --> added
'X_umap', UMAP coordinates (adata.obsm)
sc.pl.umap(alldata, color=['patient','MajorCluster'], ncols=1)
sc.pl.umap(alldata, color=['tech_10X','MajorCluster'],wspace=.4)
To remove the batch effects from different donors, we applied bbknn algorithm with parameter “batch_key='patient', n_pcs=100” to obtain a batch-corrected space.
import bbknn
bbknn.bbknn(alldata, batch_key='patient',n_pcs=100)
computing batch balanced neighbors finished (0:01:03.37) --> added to `.uns['neighbors']` 'distances', weighted adjacency matrix 'connectivities', weighted adjacency matrix
alldata.write("../processedData/alldata.obj/bbknn-completed.h5ad")
sc.tl.umap(alldata)
sc.pl.umap(alldata, color=['patient','MajorCluster'], ncols=1)
computing UMAP
using 'X_pca' with n_pcs = 100
finished (0:01:06.63) --> added
'X_umap', UMAP coordinates (adata.obsm)
We run two rounds of Scanorama (Hie et al., 2019), an algorithm that could identify and merge shared cell types among multiple datasets, to remove the batch effects within scRNA-seq datasets of 15 cancer types. First, we applied Scanorama to datasets generated from 3′ library and 5′ library from 10x Genomics to remove the batch effects attribute to these two protocols. Then, a second-round of Scanorama was applied to remove the batch effects resulting from the diverse platforms, including 10x Genomics, MARS-Seq and inDrop.
Create individual AnnData objects from each of the datasets.
alldata2 = alldata.raw.to_adata()
# split per batch based on tech
batches = alldata.obs['tech'].cat.categories.tolist()
data_tech = {}
for batch in batches:
data_tech[batch] = alldata2[alldata2.obs['tech'] == batch,]
data_tech
{'10X': View of AnnData object with n_obs × n_vars = 44019 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap',
'Smart-seq2': View of AnnData object with n_obs × n_vars = 1232 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap'}
# split per batch into 10X objects.
batches = alldata.obs['tech_10X'].cat.categories.tolist()
del batches[2]
data_10X = {}
for batch in batches:
data_10X[batch] = alldata2[alldata2.obs['tech_10X'] == batch,]
data_10X
{'10X3': View of AnnData object with n_obs × n_vars = 12563 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap',
'10X5': View of AnnData object with n_obs × n_vars = 31456 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap'}
data_10X_ann = alldata2[alldata2.obs["tech_10X"] != "SmartSeq2", :]
data_10X_ann
View of AnnData object with n_obs × n_vars = 44019 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap'
# split per batch into SmartSeq2 objects.
data_SS2 = {}
data_SS2['SmartSeq2'] = alldata2[alldata2.obs['tech_10X'] == 'SmartSeq2',]
data_SS2
{'SmartSeq2': View of AnnData object with n_obs × n_vars = 1232 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap'}
data_SS2_ann = alldata2[alldata2.obs["tech_10X"] == "SmartSeq2", :]
data_SS2_ann
View of AnnData object with n_obs × n_vars = 1232 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'features-1'
uns: 'pca', 'neighbors', 'patient_colors', 'MajorCluster_colors'
obsm: 'X_pca', 'X_umap'
Detect HVG
# data_10X_ann2 = data_10X_ann.raw.to_adata()
# check that the matrix looks like noramlized counts
print(data_10X_ann.X[1:5,1:5])
[[-4.0733621e-02 -2.3683284e-01 1.1952481e+00 -7.0990114e-03] [ 3.0918663e-02 -1.9151421e-01 -2.9681513e-01 8.6839534e-03] [-3.1718317e-02 -2.3126537e-01 -4.1332448e-01 -5.1082806e-03] [ 1.3939840e+00 -2.1443358e-01 -3.6623287e-01 1.3139478e-03]]
var_genes_all = alldata.var.highly_variable
print("Highly variable genes: %d"%sum(var_genes_all))
# use batch_key to detect variable genes in each dataset separately
sc.pp.highly_variable_genes(data_10X_ann, min_mean=0.0125, max_mean=3, min_disp=0.5, batch_key = 'tech_10X')
print("Highly variable genes intersection: %d"%sum(data_10X_ann.var.highly_variable_intersection))
print("Number of batches where gene is variable:")
print(data_10X_ann.var.highly_variable_nbatches.value_counts())
var_genes_batch = data_10X_ann.var.highly_variable_nbatches > 0
Highly variable genes: 1999
Trying to set attribute `.var` of view, copying.
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
Highly variable genes intersection: 272
Number of batches where gene is variable:
0 8896
1 1222
2 272
Name: highly_variable_nbatches, dtype: int64
Compare overlap of the variable genes.
print("Any batch var genes: %d"%sum(var_genes_batch))
print("All data var genes: %d"%sum(var_genes_all))
print("Overlap: %d"%sum(var_genes_batch & var_genes_all))
print("Variable genes in all batches: %d"%sum(data_10X_ann.var.highly_variable_nbatches == 2))
print("Overlap batch instersection and all: %d"%sum(var_genes_all & data_10X_ann.var.highly_variable_intersection))
Any batch var genes: 1494 All data var genes: 1999 Overlap: 330 Variable genes in all batches: 272 Overlap batch instersection and all: 76
Select all genes that are variable in at least 1 datasets and use for remaining analysis.
var_select = data_10X_ann.var.highly_variable_nbatches >= 1
var_genes = var_select.index[var_select]
len(var_genes)
1494
run Scanorama
import six
import sys
sys.modules['sklearn.externals.six'] = six
import scanorama
data_10X2 = dict()
for ds in data_10X.keys():
print(ds)
data_10X2[ds] = data_10X[ds][:,var_genes]
#convert to list of AnnData objects
data_10X2_adatas = list(data_10X2.values())
# run scanorama.integrate
scanorama.integrate_scanpy(data_10X2_adatas, dimred = 100)
10X3 10X5 Found 1494 genes among all datasets [[0. 0.52177028] [0. 0. ]] Processing datasets (0, 1)
#scanorama adds the corrected matrix to adata.obsm in each of the datasets in adatas.
data_10X2_adatas[0].obsm['X_scanorama'].shape
(12563, 100)
# Get all the integrated matrices.
scanorama_int = [ad.obsm['X_scanorama'] for ad in data_10X2_adatas]
# make into one matrix.
all_s = np.concatenate(scanorama_int)
print(all_s.shape)
# add to the AnnData object
data_10X_ann.obsm["Scanorama"] = all_s
(44019, 100)
# tsne and umap
sc.pp.neighbors(data_10X_ann, n_pcs =100, use_rep = "Scanorama")
sc.tl.umap(data_10X_ann)
computing neighbors
/home/data/vip8t02/anaconda3/envs/scanpy-1.4.3/lib/python3.9/site-packages/scanpy/neighbors/__init__.py:88: DeprecationWarning: Use is_view instead of isview, isview will be removed in the future. if adata.isview: # we shouldn't need this here...
finished (0:00:12.26) --> added to `.uns['neighbors']`
'distances', distances for each pair of neighbors
'connectivities', weighted adjacency matrix
computing UMAP
finished (0:00:31.61) --> added
'X_umap', UMAP coordinates (adata.obsm)
fig, axs = plt.subplots(1, 2, figsize=(12,5),constrained_layout=True)
sc.pl.umap(data_10X_ann, color="MajorCluster", title="Scanorama ROUND1 umap", ax=axs[0], show=False)
sc.pl.umap(data_10X_ann, color="cancer", title="Scanorama ROUND1 umap", ax=axs[1], show=False)
<AxesSubplot:title={'center':'Scanorama ROUND1 umap'}, xlabel='UMAP1', ylabel='UMAP2'>
# Merge data
techdata = data_10X_ann.concatenate(data_SS2_ann)
techdata
AnnData object with n_obs × n_vars = 45251 × 10390
obs: 'patient', 'tissue', 'percent_hsp', 'percent_mito', 'n_genes', 'n_counts', 'Sub_Cluster', 'tech', 'cancer', 'tech_10X', 'MajorCluster', 'batch'
var: 'genes_index-0', 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'highly_variable_nbatches-0', 'highly_variable_intersection-0', 'features-1'
obsm: 'X_pca', 'X_umap'
Detect HVG
# check that the matrix looks like noramlized counts
print(techdata.X[1:5,1:5])
[[-4.0733621e-02 -2.3683284e-01 1.1952481e+00 -7.0990114e-03] [ 3.0918663e-02 -1.9151421e-01 -2.9681513e-01 8.6839534e-03] [-3.1718317e-02 -2.3126537e-01 -4.1332448e-01 -5.1082806e-03] [ 1.3939840e+00 -2.1443358e-01 -3.6623287e-01 1.3139478e-03]]
var_genes_all = alldata.var.highly_variable
print("Highly variable genes: %d"%sum(var_genes_all))
# use batch_key to detect variable genes in each dataset separately
sc.pp.highly_variable_genes(techdata, min_mean=0.0125, max_mean=3, min_disp=0.5, batch_key = 'tech')
print("Highly variable genes intersection: %d"%sum(techdata.var.highly_variable_intersection))
print("Number of batches where gene is variable:")
print(techdata.var.highly_variable_nbatches.value_counts())
var_genes_batch = techdata.var.highly_variable_nbatches > 0
... storing 'patient' as categorical ... storing 'Sub_Cluster' as categorical ... storing 'tech' as categorical ... storing 'cancer' as categorical ... storing 'tech_10X' as categorical
Highly variable genes: 1999
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
Highly variable genes intersection: 4
Number of batches where gene is variable:
0 9232
1 1154
2 4
Name: highly_variable_nbatches, dtype: int64
Compare overlap of the variable genes.
print("Any batch var genes: %d"%sum(var_genes_batch))
print("All data var genes: %d"%sum(var_genes_all))
print("Overlap: %d"%sum(var_genes_batch & var_genes_all))
print("Variable genes in all batches: %d"%sum(techdata.var.highly_variable_nbatches == 2))
print("Overlap batch instersection and all: %d"%sum(var_genes_all & techdata.var.highly_variable_intersection))
Any batch var genes: 1158 All data var genes: 1999 Overlap: 337 Variable genes in all batches: 4 Overlap batch instersection and all: 3
Select all genes that are variable in at least 1 datasets and use for remaining analysis.
var_select = techdata.var.highly_variable_nbatches >= 1
var_genes = var_select.index[var_select]
len(var_genes)
1158
run Scanorama
techdata3 = dict()
for ds in data_tech.keys():
print(ds)
techdata3[ds] = data_tech[ds][:,var_genes]
#convert to list of AnnData objects
tech_adatas = list(techdata3.values())
# run scanorama.integrate
scanorama.integrate_scanpy(tech_adatas, dimred = 100)
10X Smart-seq2 Found 1158 genes among all datasets [[0. 0.94886364] [0. 0. ]] Processing datasets (0, 1)
#scanorama adds the corrected matrix to adata.obsm in each of the datasets in adatas.
tech_adatas[0].obsm['X_scanorama'].shape
(44019, 100)
# Get all the integrated matrices.
scanorama_int = [ad.obsm['X_scanorama'] for ad in tech_adatas]
# make into one matrix.
all_s = np.concatenate(scanorama_int)
print(all_s.shape)
# add to the AnnData object
techdata.obsm["Scanorama"] = all_s
(45251, 100)
# tsne and umap
sc.pp.neighbors(techdata, n_pcs =100, use_rep = "Scanorama")
sc.tl.umap(techdata)
computing neighbors
/home/data/vip8t02/anaconda3/envs/scanpy-1.4.3/lib/python3.9/site-packages/scanpy/neighbors/__init__.py:88: DeprecationWarning: Use is_view instead of isview, isview will be removed in the future. if adata.isview: # we shouldn't need this here...
finished (0:00:12.92) --> added to `.uns['neighbors']`
'distances', distances for each pair of neighbors
'connectivities', weighted adjacency matrix
computing UMAP
finished (0:00:27.35) --> added
'X_umap', UMAP coordinates (adata.obsm)
fig, axs = plt.subplots(1, 2, figsize=(12,5),constrained_layout=True)
sc.pl.umap(techdata, color="MajorCluster", title="Scanorama ROUND2 umap", ax=axs[0], show=False)
sc.pl.umap(techdata, color="cancer", title="Scanorama ROUND2 umap", ax=axs[1], show=False)
<AxesSubplot:title={'center':'Scanorama ROUND2 umap'}, xlabel='UMAP1', ylabel='UMAP2'>
techdata.write("../processedData/alldata.obj/finish-integration-with-umap.h5ad")
import IPython
print(IPython.sys_info())
{'commit_hash': '24512bd29',
'commit_source': 'installation',
'default_encoding': 'utf-8',
'ipython_path': '/home/data/vip8t02/anaconda3/envs/scanpy-1.4.3/lib/python3.9/site-packages/IPython',
'ipython_version': '7.27.0',
'os_name': 'posix',
'platform': 'Linux-5.11.0-27-generic-x86_64-with-glibc2.31',
'sys_executable': '/home/data/vip8t02/anaconda3/envs/scanpy-1.4.3/bin/python',
'sys_platform': 'linux',
'sys_version': '3.9.7 (default, Sep 16 2021, 13:09:58) \n[GCC 7.5.0]'}